#coding:utf-8
import requests,re, json, time
'''
Scrape movie information from the Maoyan Top 100 board.
'''
# Compiled once at import time instead of on every call. For each <dd> entry
# the seven groups capture, in order: the text of the "board-index" element,
# the first title="..." attribute, the src="..." following "board-img", the
# text of the "star" <p>, the text of the "releasetime" <p>, and the texts of
# the "integer" and "fraction" <i> elements. re.S lets '.' span newlines,
# because one entry covers several lines of HTML.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'
    r'.*?title="(.*?)"'
    r'.*?board-img.*?src="(.*?)"'
    r'.*?</a>.*?<p.*?star.*?>(.*?)</p>'
    r'.*?releasetime.*?>(.*?)</p>'
    r'.*?.*?integer.*?>(.*?)</i>'
    r'.*?fraction.*?>(.*?)</i>',
    re.S,
)


def _parse_movies(html):
    """Return one dict per movie entry matched by ``_MOVIE_PATTERN`` in *html*."""
    return [
        {
            'index': index,
            'title': title,
            'image': image,
            'actor': actor.strip(),
            'time': release.strip(),
            # The score is split across two elements in the markup; join them.
            'score': integer.strip() + fraction.strip(),
        }
        for index, title, image, actor, release, integer, fraction
        in _MOVIE_PATTERN.findall(html)
    ]


def getPage(url, timeout=10, outfile='top100.txt'):
    """Fetch one board page and append its movie entries to a text file.

    Each matched entry is written as a single JSON object per line (UTF-8,
    non-ASCII characters kept as-is). Nothing is written, and the file is not
    created, when the response status is not 200 or no entry matches.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.
        outfile: Path of the file the entries are appended to.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request fails or times out.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    # Request the page.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Only a successful response is parsed; anything else is skipped silently.
    if response.status_code != 200:
        return

    movies = _parse_movies(response.text)
    if not movies:
        return

    # Open the output once per page rather than once per entry.
    with open(outfile, 'a', encoding='utf-8') as f:
        f.writelines(
            json.dumps(movie, ensure_ascii=False) + '\n' for movie in movies
        )

if __name__ == '__main__':
    # The board is requested in ten pages, with offsets 0, 10, ..., 90.
    base_url = 'http://maoyan.com/board/4?offset='
    for offset in range(0, 100, 10):
        getPage(base_url + str(offset))
        # Pause for a second between consecutive requests.
        time.sleep(1)
